In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Summary

scikit-learn API

X : data, 2d numpy array or scipy sparse matrix of shape (n_samples, n_features)

y : targets, 1d numpy array of shape (n_samples,)

``model.fit(X_train, [y_train])``

| ``model.predict(X_test)`` | ``model.transform(X_test)`` |
|---------------------------|-----------------------------|
| Classification            | Preprocessing               |
| Regression                | Dimensionality Reduction    |
| Clustering                | Feature Extraction          |
|                           | Feature selection           |

Model evaluation and parameter selection


In [2]:
# cross_val_score lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

digits = load_digits()
# Rescale the features by 1/16 (presumably maps the pixel intensities
# into [0, 1] -- confirm against the load_digits documentation).
X, y = digits.data / 16., digits.target

# 5-fold cross-validation of a logistic regression with default settings;
# the cell displays one score per fold.
cross_val_score(LogisticRegression(), X, y, cv=5)


Out[2]:
array([ 0.93956044,  0.91160221,  0.95264624,  0.96358543,  0.88450704])

In [3]:
# GridSearchCV and train_test_split both live in sklearn.model_selection;
# sklearn.grid_search and sklearn.cross_validation were removed in 0.20.
from sklearn.model_selection import GridSearchCV, train_test_split

# Seed the shuffle so the split -- and every score derived from it in the
# following cells -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Try six values of the regularisation parameter C, log-spaced from 1e-3 to 1e2.
grid = GridSearchCV(LogisticRegression(), param_grid={'C': np.logspace(-3, 2, 6)})
grid.fit(X_train, y_train)

# Score on the held-out test split.
grid.score(X_test, y_test)


Out[3]:
0.96666666666666667

Model complexity, overfitting, underfitting

Pipelines


In [4]:
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_pipeline

# Chain feature selection (keep the 59 best-scoring features) with a
# logistic regression, and fit the whole chain on the training split.
# fit() returns the pipeline itself, so `pipe` is the fitted pipeline.
pipe = make_pipeline(
    SelectKBest(k=59),
    LogisticRegression(),
).fit(X_train, y_train)

# Score of the fitted pipeline on the held-out test split.
pipe.score(X_test, y_test)


C:\Users\andy\Anaconda\lib\site-packages\sklearn\feature_selection\univariate_selection.py:111: UserWarning: Features [ 0 32 39 56] are constant.
  UserWarning)
C:\Users\andy\Anaconda\lib\site-packages\sklearn\feature_selection\univariate_selection.py:112: RuntimeWarning: invalid value encountered in divide
  f = msb / msw
Out[4]:
0.97555555555555551

Scoring metrics


In [5]:
# Binary task "is this digit a 3?", evaluated with the estimator's default
# scorer over 5 folds.
cross_val_score(
    estimator=LogisticRegression(C=.01),
    X=X,
    y=(y == 3),
    cv=5,
)


Out[5]:
array([ 0.89722222,  0.89722222,  0.89722222,  0.90529248,  0.89944134])

In [6]:
# Same binary task and model as the previous cell, but scored with ROC AUC
# instead of the estimator's default scorer.
cross_val_score(
    estimator=LogisticRegression(C=.01),
    X=X,
    y=(y == 3),
    cv=5,
    scoring="roc_auc",
)


Out[6]:
array([ 0.99891222,  0.98267927,  0.9953142 ,  0.99079807,  0.91088682])

Data Wrangling


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# NOTE: this cell rebinds X and y, replacing the digits data used by the
# earlier cells with a tiny mixed numeric/categorical toy data set.
# Column 0 is a continuous feature, column 1 is an integer-coded city.
X = np.array([[15.9, 1], # from Tokyo
              [21.5, 2], # from New York
              [31.3, 0], # from Paris
              [25.1, 2], # from New York
              [63.6, 1], # from Tokyo
              [14.4, 1], # from Tokyo
              ])

y = np.array([0, 1, 1, 1, 0, 0])

# OneHotEncoder's `categorical_features` argument was removed in
# scikit-learn 0.22; choosing which columns to encode is now done with
# ColumnTransformer.
encoder = ColumnTransformer(
    [("city", OneHotEncoder(), [1])],  # one-hot encode column 1 only
    remainder="passthrough",           # keep column 0, appended after the encoded columns
    sparse_threshold=0,                # always return a dense array, as sparse=False did
)
pipe = make_pipeline(encoder, LogisticRegression())
pipe.fit(X, y)

# Score on the training data itself (there is no held-out split here).
pipe.score(X, y)


Out[7]:
1.0

Out-of-core Learning